#Load libraries
library(TCGAWorkflowData)
library(TCGAbiolinks)
package ‘TCGAbiolinks’ was built under R version 4.1.2Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
=============================================================
______ ___ ____ ___
|| | | | | | o __ | o _ __
|| | | ___ |___| |__ | | | | | | | | |_/ |__
|| |___ |____| | | |__| | |__| |__ | | |_| | \ __|
------------------------------------------------------------
Query, download & analyze - GDC
Version:2.22.4
==============================================================
library(SummarizedExperiment)
Loading required package: MatrixGenerics
Loading required package: matrixStats
package ‘matrixStats’ was built under R version 4.1.2
Attaching package: ‘MatrixGenerics’
The following objects are masked from ‘package:matrixStats’:
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins,
colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads,
colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges,
colRanks, colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs,
rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs,
rowMeans2, rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads,
rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
Loading required package: GenomicRanges
package ‘GenomicRanges’ was built under R version 4.1.2Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
rownames, sapply, setdiff, sort, table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
package ‘S4Vectors’ was built under R version 4.1.3
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Loading required package: IRanges
Loading required package: GenomeInfoDb
package ‘GenomeInfoDb’ was built under R version 4.1.2Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor,
see 'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: ‘Biobase’
The following object is masked from ‘package:MatrixGenerics’:
rowMedians
The following objects are masked from ‘package:matrixStats’:
anyMissing, rowMedians
library(edgeR)
Loading required package: limma
package ‘limma’ was built under R version 4.1.3
Attaching package: ‘limma’
The following object is masked from ‘package:BiocGenerics’:
plotMA
library(EnhancedVolcano)
Loading required package: ggplot2
package ‘ggplot2’ was built under R version 4.1.2Loading required package: ggrepel
Registered S3 methods overwritten by 'ggalt':
method from
grid.draw.absoluteGrob ggplot2
grobHeight.absoluteGrob ggplot2
grobWidth.absoluteGrob ggplot2
grobX.absoluteGrob ggplot2
grobY.absoluteGrob ggplot2
library(Seurat)
package ‘Seurat’ was built under R version 4.1.2Attaching SeuratObject
Attaching sp
Attaching package: ‘Seurat’
The following object is masked from ‘package:SummarizedExperiment’:
Assays
library(lumi)
Setting options('download.file.method.GEOquery'='auto')
Setting options('GEOquery.inmemory.gpl'=FALSE)
No methods found in package ‘RSQLite’ for request: ‘dbListFields’ when loading ‘lumi’
library(tidyverse)
── Attaching packages ────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ tibble 3.1.7 ✔ dplyr 1.0.9
✔ tidyr 1.2.0 ✔ stringr 1.4.0
✔ readr 2.1.2 ✔ forcats 0.5.1
✔ purrr 0.3.4
package ‘tibble’ was built under R version 4.1.2package ‘tidyr’ was built under R version 4.1.2package ‘readr’ was built under R version 4.1.2package ‘dplyr’ was built under R version 4.1.2── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::collapse() masks IRanges::collapse()
✖ dplyr::combine() masks lumi::combine(), Biobase::combine(), BiocGenerics::combine()
✖ dplyr::count() masks matrixStats::count()
✖ dplyr::desc() masks IRanges::desc()
✖ tidyr::expand() masks S4Vectors::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::first() masks S4Vectors::first()
✖ dplyr::lag() masks stats::lag()
✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
✖ dplyr::rename() masks S4Vectors::rename()
✖ dplyr::slice() masks IRanges::slice()
options(ggrepel.max.overlaps = Inf)
#Load TCGA annotations
mrna_query <- GDCquery(project = "TCGA-PRAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts")
--------------------------------------
o GDCquery: Searching in GDC database
--------------------------------------
Genome of reference: hg38
--------------------------------------------
oo Accessing GDC. This might take a while...
--------------------------------------------
ooo Project: TCGA-PRAD
--------------------
oo Filtering results
--------------------
ooo By data.type
ooo By workflow.type
----------------
oo Checking data
----------------
ooo Check if there are duplicated cases
ooo Check if there results for the query
-------------------
o Preparing output
-------------------
mrna_query$results[[1]]
nrml_meta <- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Solid Tissue Normal")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")
#Process normal prostate counts
full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"
nrml_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
print(paste0("processing: ",list.files(path = paste0(full_path, f))))
counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
return(data.frame(f = counts[,4]))
}))
symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0427d696-a33d-4005-ac1d-c5363754226e/b1becd71-bbed-4a0f-af9c-94b9b2af928b.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]
colnames(nrml_rna) <- list.files(path = full_path)
nrml_rna$Gene <- symbols$X2
write.table(x = nrml_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)
#Change IDS for normals
nrml_rna <- read.table(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Normal/normal_rna_counts.txt", header = T)
nrml_rna <- nrml_rna[!duplicated(nrml_rna$Gene),]
row.names(nrml_rna) <- nrml_rna$Gene
nrml_rna <- dplyr::select(nrml_rna, -Gene)
colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "^X*", replacement = "")
colnames(nrml_rna) <- colnames(nrml_rna) %>% gsub(pattern = "\\.", replacement = "-")
nrml_rna <- cpm(nrml_rna, log = T) %>% t() %>% data.frame()#Log normalize
#nrml_rna <- nrml_rna %>% t() %>% data.frame()
nrml_rna <- nrml_rna %>% mutate(id = row.names(.), batch = "Normal") %>% merge(dplyr::select(nrml_meta , c(id, cases.submitter_id) ))
nrml_rna <- nrml_rna %>% mutate(cases = cases.submitter_id, Purity = NA, Grade_Mutational_status =NA)
#Download TCGA PRAD sample list
prad_meta<- mrna_query$results[[1]] %>% dplyr::filter(sample_type == "Primary Tumor")
#GDCdownload(mrna_query, method = "api", directory = "/Users/ds/Desktop")
#Process PRAD counts
full_path <- "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"
prad_rna <- do.call(cbind, lapply(list.files(path = full_path), FUN = function(f){
print(paste0("processing: ",list.files(path = paste0(full_path, f))))
counts <- read_tsv(file = list.files(path = paste0(full_path, f), full.names = T), skip = 6, col_names = F )
return(data.frame(f = counts[,4]))
}))
#prad_rna <- cpm(prad_rna, log = T, prior.count = 1) %>% data.frame()
colnames(prad_rna) <- list.files(path = full_path)
symbols <- read_tsv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/0007888f-8d96-4c01-8251-7fef6cc71596/88215dd0-5841-44f1-9393-eefd8238cbb3.rna_seq.augmented_star_gene_counts.tsv", skip = 6, col_names = F)[,2]
prad_rna$Gene <- symbols$X2
#write.table(x = prad_rna, file = "/Users/ds/Desktop/TCGA/TCGA-PRAD-Tumor/prad_rna_cpm.txt", quote = F, sep = "\t", row.names = F, col.names = T)
write.table(x = prad_rna, file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_rna_counts.txt", quote = F, sep = "\t", row.names = F, col.names = T)
#Format PRAD count data
prad_rna$Grade_Mutational_status
[1] 7 6 7 7 7 6 7 7 7 9 7 8 7 7 7 7 7 9 7 6 6 7 7 6 7 7 9 7 7 7 6 6 7 6 6 7 6 7 7 9 9 6 8 9 7 6 9 7 7
[50] 7 7 7 8 6 9 7 7 7 7 7 7 8 6 7 7 7 6 7 6 7 7 8 6 7 9 7 7 7 6 7 7 6 8 7 7 9 7 7 7 7 7 7 6 7 7 7 8 8
[99] 6 7 7 8 7 7 7 9 7 9 6 6 9 8 6 7 8 7 6 6 7 8 7 8 9 7 6 9 6 7 7 7 6 7 7 8 7 7 7 7 9 8 7 6 6 9 7 8 7
[148] 8 7 7 7 7 7 6 7 7 7 6 6 7 9 7 7 8 6 6 9 9 6 8 7 7 9 6 7 9 7 9 7 7 9 6 8 9 7 6 7 7 8 7 9 8 9 7 8 7
[197] 8 7 8 8 7 7 7 6 9 6 6 7 6 8 6 7 8 7 7 6 6 9 7 6 7 7 7 7 7 6 7 7 6 7 7 7 6 6 7 7 7 7 7 7 9 6 8 7 7
[246] 7 8 8 7 6 7 8 6 7 8 7 9 6 8 8 8 6 9 9 7 7 7 7 8 7 9 9 7 7 7 7 8 7 7 7 8 7 7 7 7 8 9 7 7 7 7 7 7 6
[295] 9 7 6 7 6 8 9 7 7 6 7 7 7 6 6 7 7 8 6 7 7 8 7 8 9 6 7 7 6 8 7 9 7 7 9 7 7 7 9 7 7 10
#Add PRAD purity from GDC website
anno_prad <- read.csv(file = "/Users/ds/Desktop/projects/TCGA/TCGA-PRAD-Tumor/prad_supp.csv", header = T)
anno_prad <- anno_prad %>% dplyr::select(avgRNA_purity, PATIENT_ID, Reviewed_Gleason_sum)
prad_rna <- inner_join(prad_rna, anno_prad, by = c("cases.submitter_id" = "PATIENT_ID")) %>% mutate(Purity = avgRNA_purity * 100, cases = cases.submitter_id, Grade_Mutational_status = Reviewed_Gleason_sum)
anno_prad
NA
#Format mCRPC count data
anno_mcrpc <- read.table("/Users/ds/Desktop/projects/data/anno/202009_deepRNAseq_sample_full.txt", header =T)
anno_mcrpc <- anno_mcrpc %>% dplyr::select(c(sample_id, wgs_id, biopsy_site,tumor_purity_wgs, tumor_purity_rna , tumor_purity_hist, AR))
anno_mcrpc <- anno_mcrpc %>% mutate(cases = sample_id, Purity = tumor_purity_rna, Grade_Mutational_status =AR) %>% dplyr::select(cases, Purity, Grade_Mutational_status)
mcrpc_rna <- read.table(file = "/Users/ds/Desktop/projects/data/rna/mCRPC_RNA_counts_genename.txt", header = T)
mcrpc_rna <- mcrpc_rna[!duplicated(mcrpc_rna$Genename),]
row.names(mcrpc_rna) <- mcrpc_rna$Genename
mcrpc_rna <- mcrpc_rna[,-1]
mcrpc_rna <- cpm(mcrpc_rna, log = T) %>% t() %>% data.frame()#Log normalize for heatmap
#mcrpc_rna <- mcrpc_rna %>% t() %>% data.frame()#Counts for DEG analysis
row.names(mcrpc_rna) <- gsub("\\.", "-", x = row.names(mcrpc_rna))
mcrpc_rna <- mcrpc_rna %>% mutate(cases = row.names(.), batch = "mCRPC")
#Add mCRPC purity
mcrpc_rna <- mcrpc_rna %>% merge(anno_mcrpc)
#Merge PRAD and mCRPC datasets (DEG analysis)
#Merge PRAD, Normal, and mCRPC datasets (Heatmap)
query <- intersect(colnames(prad_rna), colnames(mcrpc_rna)) %>% intersect( colnames(nrml_rna))#Get lncRNAs in common + cases + purity
#Combine PRAD and mCRPC - rows are samples and columns are lncRNAs + RNA purity + batch
comb_rna <- rbind( nrml_rna[,query], arrange(prad_rna[,query],Grade_Mutational_status), arrange(mcrpc_rna[,query],Grade_Mutational_status))
comb_rna <- data.frame(comb_rna[,c("batch", "Purity", "cases", "Grade_Mutational_status")], comb_rna[,colnames(comb_rna) %in% c(pca_lncrna, notpca_lncrna)])
comb_rna <- comb_rna %>% mutate(cases = paste0(cases, ":", 1:nrow(comb_rna)))
row.names(comb_rna) <- comb_rna$cases
comb_rna$batch %>% table()
.
mCRPC Normal PRAD
74 52 336
#Plotting heatmap of progression for prostate lncRNA
#Plot lncRNAs in TME
length(filt)
[1] 49